In [1]:
# Shared notebook setup.  NOTE(review): the wildcard import supplies the names
# used below (pd, os, DATA_PATH, display, HTML) -- prefer explicit imports so
# readers can see where each name comes from.
from setup_notebooks import *
%matplotlib inline

In [2]:
# Notebook display configuration.  The %matplotlib inline magic is already
# issued in the setup cell above, so it is not repeated here.
display(HTML("<style>.container { width:100% !important; }</style>"))  # full-width cells
pd.set_option('display.max_rows', 12)      # keep DataFrame reprs short
pd.set_option('display.max_columns', 200)  # but show wide frames in full



In [29]:
from gensim.models import TfidfModel, LsiModel
from gensim.corpora import Dictionary
from collections import OrderedDict

Load previously cleaned data


In [3]:
# Reload the previously cleaned data (DATA_PATH comes from setup_notebooks).
# NOTE(review): engine='python' is used for the first two reads but not the
# third -- confirm whether it is still required for these gzip files.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'))
df.tokens


Out[3]:
0         ['python', 'never', 'stop', 'learning', 'what'...
1                       ['Watching', 'Boa', 'vs', 'Python']
2         ['Monty', 'Python', 'The', 'silly', 'walk', 'v...
3         ['Senior', 'Software', 'Engineer', 'Full', 'St...
4         ['Architect', 'Django', 'Solr', 'Platform', 'E...
5           ['peaceful', 'rain', 'Python', 'inevitability']
                                ...                        
183064    ['Las', 'mejores', 'ides', 'para', 'Python', '...
183065    ['Gagal', 'tidur', 'gegara', 'habis', 'vertica...
183066         ['Go', 'boa', 'wkwk', 'Boa', 'vs', 'Python']
183067    ['RT', 'RealPython', 'List', 'of', 'Python', '...
183068                  ['Watching', 'Boa', 'vs', 'Python']
183069    ['Чертова', 'дюжина', 'вакансий', 'в', 'IT', '...
Name: tokens, dtype: object

In [9]:
d = Dictionary.from_documents(([str(s) for s in row]for row in df.tokens))

In [4]:
df.tokens.iloc[0]


Out[4]:
"['python', 'never', 'stop', 'learning', 'what', 'you', 'enjoy', 'doing']"

In [ ]:
# One way to fix this: parse the stringified lists back into real lists.
# ast.literal_eval only evaluates Python literal syntax; eval() would execute
# arbitrary code embedded in the data, which is unsafe for scraped tweets.
from ast import literal_eval
df['tokens'] = df['tokens'].apply(literal_eval)

When we said "QUOTE_NONNUMERIC" we didn't mean ALL nonnumeric fields ;)

So we can recreate the token lists using split() again.


In [13]:
# Simpler fix: rebuild the token lists straight from the raw text column.
# A plain whitespace split recovers real Python lists of strings.
df['tokens'] = df['txt'].str.split()
df['tokens']


Out[13]:
0         [python, never, stop, learning, what, you, enj...
1                               [Watching, Boa, vs, Python]
2           [Monty, Python, The, silly, walk, via, YouTube]
3         [Senior, Software, Engineer, Full, Stack, Pyth...
4         [Architect, Django, Solr, Platform, Engineer, ...
5                   [peaceful, rain, Python, inevitability]
                                ...                        
183064    [Las, mejores, ides, para, Python, Antes, de, ...
183065    [Gagal, tidur, gegara, habis, vertical, limit,...
183066                     [Go, boa, wkwk, Boa, vs, Python]
183067    [RT, RealPython, List, of, Python, API, Wrappe...
183068                          [Watching, Boa, vs, Python]
183069    [Чертова, дюжина, вакансий, в, IT, и, Digital,...
Name: tokens, dtype: object

That's more like it, our tokens are now lists of strings not stringified lists of strings ;)


In [16]:
df.tokens.values[0:3]


Out[16]:
array([['python', 'never', 'stop', 'learning', 'what', 'you', 'enjoy', 'doing'],
       ['Watching', 'Boa', 'vs', 'Python'],
       ['Monty', 'Python', 'The', 'silly', 'walk', 'via', 'YouTube']], dtype=object)

In [17]:
# Build the gensim Dictionary (token <-> integer id mapping, plus document
# frequencies) from the now-correct lists of token strings.
d = Dictionary.from_documents(df.tokens)
d


Out[17]:
<gensim.corpora.dictionary.Dictionary at 0x7f523743de80>

In [18]:
tfidf = TfidfModel(d)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-18-af38a11fda4c> in <module>()
----> 1 tfidf = TfidfModel(d)

/home/hobs/.virtualenvs/twip3/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in __init__(self, corpus, id2word, dictionary, wlocal, wglobal, normalize)
     94             self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
     95         elif corpus is not None:
---> 96             self.initialize(corpus)
     97         else:
     98             # NOTE: everything is left uninitialized; presumably the model will

/home/hobs/.virtualenvs/twip3/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in initialize(self, corpus)
    116             if docno % 10000 == 0:
    117                 logger.info("PROGRESS: processing document #%i" % docno)
--> 118             numnnz += len(bow)
    119             for termid, _ in bow:
    120                 dfs[termid] = dfs.get(termid, 0) + 1

TypeError: object of type 'int' has no len()

Hint-Hint: gensim is sprinting this week at PyCon!


In [19]:
TfidfModel?

In [20]:
TfidfModel(df.txt)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-20-d3b6002038b1> in <module>()
----> 1 TfidfModel(df.txt)

/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in __init__(self, corpus, id2word, dictionary, wlocal, wglobal, normalize)
     94             self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
     95         elif corpus is not None:
---> 96             self.initialize(corpus)
     97         else:
     98             # NOTE: everything is left uninitialized; presumably the model will

/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in initialize(self, corpus)
    117                 logger.info("PROGRESS: processing document #%i" % docno)
    118             numnnz += len(bow)
--> 119             for termid, _ in bow:
    120                 dfs[termid] = dfs.get(termid, 0) + 1
    121 

ValueError: not enough values to unpack (expected 2, got 1)

In [21]:
TfidfModel(df.tokens)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-ddd5365ea7ab> in <module>()
----> 1 TfidfModel(df.tokens)

/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in __init__(self, corpus, id2word, dictionary, wlocal, wglobal, normalize)
     94             self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
     95         elif corpus is not None:
---> 96             self.initialize(corpus)
     97         else:
     98             # NOTE: everything is left uninitialized; presumably the model will

/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in initialize(self, corpus)
    117                 logger.info("PROGRESS: processing document #%i" % docno)
    118             numnnz += len(bow)
--> 119             for termid, _ in bow:
    120                 dfs[termid] = dfs.get(termid, 0) + 1
    121 

ValueError: too many values to unpack (expected 2)

In [10]:
TfidfModel((d.doc2bow(tokens) for tokens in df.tokens))


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-10-40aa3d3f837d> in <module>()
----> 1 TfidfModel((d.doc2bow(tokens) for tokens in df.tokens))

/home/hobs/.virtualenvs/twip3/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in __init__(self, corpus, id2word, dictionary, wlocal, wglobal, normalize)
     94             self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
     95         elif corpus is not None:
---> 96             self.initialize(corpus)
     97         else:
     98             # NOTE: everything is left uninitialized; presumably the model will

/home/hobs/.virtualenvs/twip3/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in initialize(self, corpus)
    113         dfs = {}
    114         numnnz, docno = 0, -1
--> 115         for docno, bow in enumerate(corpus):
    116             if docno % 10000 == 0:
    117                 logger.info("PROGRESS: processing document #%i" % docno)

<ipython-input-10-40aa3d3f837d> in <genexpr>(.0)
----> 1 TfidfModel((d.doc2bow(tokens) for tokens in df.tokens))

/home/hobs/.virtualenvs/twip3/lib/python3.5/site-packages/gensim/corpora/dictionary.py in doc2bow(self, document, allow_update, return_missing)
    139         """
    140         if isinstance(document, string_types):
--> 141             raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")
    142 
    143         # Construct (word, frequency) mapping.

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

But there's a simpler way.
We already have a vocabulary
with term and document frequencies in a matrix...


In [15]:
pd.Series(d.dfs)


Out[15]:
0          444
1         1658
2        53491
3          611
4         9048
5         2374
         ...  
87141        1
87142        1
87143        1
87144        1
87145        1
87146        1
dtype: int64

In [16]:
pd.Series(d.iteritems())


Out[16]:
0        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
1        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
2        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
3        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
4        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
5        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
                               ...                        
87141    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87142    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87143    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87144    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87145    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87146    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
dtype: object

OK, now I get it

  • document is a list of strings (ordered sequence of tokens)
  • bow or [bag of words] is a list of Counter-like mappings between word IDs and their count in each document
  • TfidfModel is a transformation from a BOW into a BORF, a "bag of relative frequencies"

TFIDF = BORF = term frequencies normalized by document occurrence counts


In [21]:
pd.Series(d.doc2bow(toks) for toks in df.tokens[:6])


Out[21]:
0    [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...
1                   [(8, 1), (9, 1), (10, 1), (11, 1)]
2    [(9, 1), (12, 1), (13, 1), (14, 1), (15, 1), (...
3    [(9, 1), (18, 1), (19, 1), (20, 1), (21, 1), (...
4    [(9, 1), (19, 1), (20, 1), (21, 1), (23, 1), (...
5                  [(9, 1), (37, 1), (38, 1), (39, 1)]
dtype: object

Did it assign 0 to the first word it found?
Sort-of...


In [22]:
d.token2id['python']


Out[22]:
0

In [23]:
d.token2id['Python']


Out[23]:
9

In [24]:
d.token2id['you']


Out[24]:
2

In [26]:
d[1]  # guesses anyone?


Out[26]:
'what'

In [ ]:


In [27]:
# The simpler way: build the model straight from the Dictionary's precomputed
# document frequencies by passing it as the dictionary= keyword.
tfidf = TfidfModel(dictionary=d)
tfidf


Out[27]:
<gensim.models.tfidfmodel.TfidfModel at 0x7f52402b0cf8>

In [30]:
# Document frequency per token, keyed by the token string and sorted
# alphabetically so related spellings appear together.
token_df_pairs = sorted((d.id2token[token_id], n_docs)
                        for token_id, n_docs in tfidf.dfs.items())
dfs = pd.Series(OrderedDict(token_df_pairs))
dfs


Out[30]:
A           7338
AA             1
AAA            2
AAAA           1
AAAAAA         1
AAAAAAND       2
            ... 
THE            1
W              2
WANT           3
WARNING        1
YOU           10
𝓩Ᏸ             1
dtype: int64

In [27]:
dfs.iloc[4000:4030]


Out[27]:
Bioinformatics    20
Biological         3
Biologist          2
Biologists        13
Biology           17
Biomechanics       1
                  ..
Birkenstocks       1
Birkin             2
Birman            56
Birmann            1
Birmingham        11
Birth             12
dtype: int64

In [28]:
tfidf.num_docs


Out[28]:
183070

In [29]:
tfidf.num_nnz


Out[29]:
2392557

In [30]:
tfidf.save(os.path.join(DATA_PATH, 'tfidf'))

In [31]:
tfidf2 = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf'))

In [32]:
tfidf2.num_nnz


Out[32]:
2392557

In [ ]: